Building random forest models using inputs derived from a DEM and a set of landslide initiation points to predict which areas are more susceptible to landslides.
library(terra)
## terra version 1.4.22
library(TerrainWorksUtils)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(here)
## here() starts at C:/Work/Source/LandslideSusceptibility
library(ggmap)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
##
## Attaching package: 'ggmap'
## The following object is masked from 'package:terra':
##
## inset
# Source every helper script in the project's helper/ directory.
for (file in list.files(here("helper"))) {
  source(here("helper", file))
}
# Root directory holding the Scottsburg input data.
dataDir <- "E:/NetmapData/Scottsburg"
# Resolve report parameters (from the Rmd `params` list) into absolute paths.
shade_file <- file.path(dataDir, params$shade_file)
varRasterFiles <- lapply(params$input_rasters, function(x) {
  file.path(dataDir, x)
})
analysis_region_params <- params$analysis_region_parameters
# Use `<-` for assignment (the original mixed in `=`).
initPointsFile <- file.path(dataDir, params$init_points_file)
noninitProportion <- params$noninit_proportion        # non-init : init point ratio
bufferRadius <- params$buffer_radius                  # buffer radius around each point
bufferExtractionMethod <- params$buffer_extraction    # how values are pulled from buffers
initRangeExpansion <- params$init_range_expansion     # percent to widen each variable's range
iterationsCount <- params$iterations_count            # number of resampling iterations
shade_file: E:/NetmapData/Scottsburg/shade_scottsburg.tif
varRasterFiles: E:/NetmapData/Scottsburg/dev_15.tif, E:/NetmapData/Scottsburg/prof_15.tif, E:/NetmapData/Scottsburg/grad_15.tif, E:/NetmapData/Scottsburg/plan_15.tif, E:/NetmapData/Scottsburg/pca_15m_48hr.flt, E:/NetmapData/Scottsburg/pca_15m_5hr.flt
analysisRegionParams: grad_15, plan_15
initPointsFile: E:/NetmapData/Scottsburg/Scottsburg_Upslope.shp
noninitProportion: 3
bufferRadius: 30
bufferExtractionMethod: center cell
initRangeExpansion: 0
iterationsCount: 5
Required inputs: DEM for a region and a set of landslide initiation points. For this exploration, we are using the Scottsburg dataset, which was collected near Scottsburg, OR, including a DEM of the region and 74 landslide initiation points collected in the field.
# get background map
# Fetch an OSM basemap for the Scottsburg lon/lat bounding box and convert it
# to a SpatRaster so it can be reprojected and plotted alongside terra layers.
terrain <- terra::rast(
ggmap::get_map(c(-123.90, 43.5978, -123.80, 43.66),
source = "osm",
messaging = FALSE))
## Source : http://tile.stamen.com/terrain/13/1276/2989.png
## Source : http://tile.stamen.com/terrain/13/1277/2989.png
## Source : http://tile.stamen.com/terrain/13/1278/2989.png
## Source : http://tile.stamen.com/terrain/13/1276/2990.png
## Source : http://tile.stamen.com/terrain/13/1277/2990.png
## Source : http://tile.stamen.com/terrain/13/1278/2990.png
## Source : http://tile.stamen.com/terrain/13/1276/2991.png
## Source : http://tile.stamen.com/terrain/13/1277/2991.png
## Source : http://tile.stamen.com/terrain/13/1278/2991.png
# Get hillshade
shade <- terra::rast(file.path(dataDir, "shade_scottsburg.tif"))
# Get points
# Landslide initiation points collected in the field (shapefile).
initPoints <- terra::vect(initPointsFile)
# Match projections
# Reproject the basemap to the hillshade's CRS so the layers overlay correctly.
terrain <- terra::project(terrain, crs(shade))
# Basemap, then greyscale hillshade, then the initiation points on top.
plotRGB(terrain)
plot(shade, col=grey(0:100/100), legend = FALSE, add= TRUE)
points(initPoints, cex = 2, col = "#c44a41")
Surface metrics are calculated from the DEM using the DEMutilities toolbox from the ForestedWetlands repo. Here, we use gradient, plan curvature, profile curvature, and elevation deviation at the 15 meter scale, and partial contributing area at the 15-meter scale calculated for a 5 hour and 48 hour period. This gives us 6 input rasters.
We also need a set of non-initiation training points. To derive a set of meaningful non-initiation points, we first exclude all points outside of the range of values where landslides can be found based on the range of input values for all landslide initiation points. This gives us our “analysis region.”
# Load each predictor raster from disk.
varRasterList <- lapply(varRasterFiles, function(path) terra::rast(path))
# Snap every predictor raster onto the hillshade's grid so they share
# extent, resolution, and projection.
varRasterList <- TerrainWorksUtils::alignRasters(shade, varRasterList)
# Stack the aligned predictors into one multi-layer raster.
varsRaster <- terra::rast(varRasterList)
# Circular buffer of bufferRadius around each initiation point.
initBuffers <- terra::buffer(initPoints, width = bufferRadius)
# Build the analysis-region mask: cells where every constraining variable
# falls inside the value range observed at initiation points.
if (length(analysis_region_params) > 0) {
# Calculate initiation range for each variable
# initRange holds a per-variable min/max table of values observed within the
# initiation-point buffers, widened by initRangeExpansion percent.
initRange <- createInitiationRange(
terra::subset(varsRaster, analysis_region_params),
initBuffers,
initRangeExpansion
)
initRange
# Identify cells in the study region that have variable values within their
# initiation ranges
initRaster <- terra::subset(varsRaster, analysis_region_params)
for (varName in names(initRaster)) {
varRaster <- initRaster[[varName]]
# Get variable value limits
minInitValue <- initRange[varName, "min"]
maxInitValue <- initRange[varName, "max"]
# NA-out cells with values outside variable initiation range
varInitRaster <- terra::app(varRaster, function(x) {
ifelse(x < minInitValue | x > maxInitValue, NA, x)
})
# Update the raster in the input raster stack
initRaster[[varName]] <- varInitRaster
}
# NA-out cells with ANY variable value outside its initiation range
# all() collapses the layers: only cells that survived every variable's
# range filter remain in the mask.
analysisRegionMask <- all(initRaster)
} else {
# No constraining variables: use all predictor layers as-is.
analysisRegionMask <- all(varsRaster)
}
We expanded the initiation range by 0 percent.
# For each constraining variable, show which cells fall inside its
# initiation range (TRUE) vs. outside (FALSE).
if (length(analysis_region_params) > 0) {
for (layer in names(initRaster)) {
plot(!is.na(initRaster[layer]),
main = layer,
axes = FALSE)
}
}
# Combined analysis region: cells inside every variable's range.
plot(analysisRegionMask,
main = "Analysis Region",
axes = FALSE)
# Optional diagnostic: count, per cell, how many variables are in range.
# analysisCountRaster <- terra::app(initRaster,
# function(x) sum(!is.na(x)))
# plot(analysisCountRaster,
# col = c(RColorBrewer::brewer.pal(
# length(unique(values(analysisCountRaster)))-1,
# "YlGnBu"),
# "#eb972a"
# ),
# axes = FALSE)
We then draw a buffer around initiation points and randomly sample non-initiation points from the remaining analysis region.
# Double the size of the initiation buffers
# A wider exclusion zone keeps non-initiation samples well away from known
# initiation points.
expInitBuffers <- terra::buffer(initPoints,
width = bufferRadius * 2)
# Remove expanded initiation buffers from the viable non-initiation region
# copy() makes a deep copy so the cell assignment below does not also
# modify analysisRegionMask.
noninitRegion <- terra::copy(analysisRegionMask)
initCellIndices <- terra::extract(noninitRegion,
expInitBuffers,
cells = TRUE)$cell
noninitRegion[initCellIndices] <- NA
# Determine how many to generate
# noninitProportion non-initiation points per initiation point, rounded up.
noninitBuffersCount <- ceiling(length(initPoints) * noninitProportion)
noninitBuffers <- generateNoninitiationBuffers(
noninitBuffersCount,
noninitRegion,
bufferRadius
)
# NOTE(review): the warnings below mean spatSample found fewer eligible
# cells than requested — the sample may be smaller than noninitBuffersCount.
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
# Overlay initiation and sampled non-initiation buffers on the terrain map.
plotRGB(terrain)
plot(shade, col = grey(0:100/100), legend = FALSE, add = TRUE)
#plot(analysisRegionMask, add = TRUE, col = "#377eb8")
points(initBuffers, col = "#fb8072")
points(noninitBuffers, col = "#a6d854")
legend("topright",
       legend = c("Analysis Region",
                  "Initiation Points",
                  "Non-initiation Points"),  # fixed typo: "Non-initation"
       fill = c("#377eb8", NA, NA),
       border = c(1, "transparent", "transparent"),
       col = c(NA, "#fb8072", "#a6d854"),
       pch = c(NA, 16, 16),
       pt.cex = c(1, 2, 2)
)
After generating the non-initiation points, we can extract input values from the rasters to create our training data.
# Build the training table: predictor values extracted from each buffer,
# labeled by class (initiation vs. non-initiation).
trainingData <- extractBufferValues(
  varsRaster,
  initBuffers,
  noninitBuffers,
  bufferExtractionMethod
)
# Drop the x/y coordinate columns; they are not model inputs.
dropCoords <- names(trainingData) %in% c("x", "y")
trainingData <- trainingData[, !dropCoords]
# Predictors only: every column except the "class" label.
trainingData_no_y <- trainingData[, !grepl("class", names(trainingData))]
# Pairwise predictor correlations at initiation points.
cor(trainingData_no_y[trainingData$class == "initiation", ])
## dev_15 prof_15 grad_15 plan_15 pca_15m_48hr
## dev_15 1.0000000 0.7362993 0.34397764 -0.6443822 -0.53568435
## prof_15 0.7362993 1.0000000 0.30833727 -0.3634900 -0.21530733
## grad_15 0.3439776 0.3083373 1.00000000 -0.1783508 -0.04878078
## plan_15 -0.6443822 -0.3634900 -0.17835078 1.0000000 0.80307085
## pca_15m_48hr -0.5356843 -0.2153073 -0.04878078 0.8030709 1.00000000
## pca_15m_5hr 0.1158157 0.1530577 0.63672171 0.1379310 0.23944724
## pca_15m_5hr
## dev_15 0.1158157
## prof_15 0.1530577
## grad_15 0.6367217
## plan_15 0.1379310
## pca_15m_48hr 0.2394472
## pca_15m_5hr 1.0000000
cor(trainingData_no_y[trainingData$class == "non-initiation",])
## dev_15 prof_15 grad_15 plan_15 pca_15m_48hr
## dev_15 1.00000000 0.42208357 0.30645502 -0.564268244 -0.3730377
## prof_15 0.42208357 1.00000000 -0.15324743 -0.136460753 0.1652117
## grad_15 0.30645502 -0.15324743 1.00000000 -0.087208619 -0.2915351
## plan_15 -0.56426824 -0.13646075 -0.08720862 1.000000000 0.5435034
## pca_15m_48hr -0.37303773 0.16521169 -0.29153508 0.543503439 1.0000000
## pca_15m_5hr 0.03656099 0.03941897 0.26642234 0.005599802 0.1674090
## pca_15m_5hr
## dev_15 0.036560985
## prof_15 0.039418965
## grad_15 0.266422342
## plan_15 0.005599802
## pca_15m_48hr 0.167409000
## pca_15m_5hr 1.000000000
# Diagonal panel function for pairs(): draws a scaled histogram of x.
panel.hist <- function(x, ...) {
  # Remember the current user coordinates and restore them on exit.
  # (The original on.exit(par(usr)) passed an unnamed numeric vector,
  # which does not restore the parameter; par(usr = usr) does.)
  usr <- par("usr")
  on.exit(par(usr = usr), add = TRUE)
  # Keep the panel's x-range but fix the y-range to [0, 1.5] so the
  # tallest bar fills two thirds of the panel height.
  par(usr = c(usr[1:2], 0, 1.5))
  his <- hist(x, plot = FALSE)
  breaks <- his$breaks
  nB <- length(breaks)
  # Scale bar heights so the maximum count maps to 1.
  y <- his$counts
  y <- y / max(y)
  rect(breaks[-nB], 0, breaks[-1], y, ...)
  # lines(density(x), col = 2, lwd = 2) # Uncomment to add density lines
}
# Pairs plots with histograms on the diagonal, split by class.
pairs(trainingData_no_y[trainingData$class == "initiation",],
      upper.panel = NULL,
      diag.panel = panel.hist,
      main = "Initiation points")  # fixed typo: "Initation"
pairs(trainingData_no_y[trainingData$class == "non-initiation",],
      upper.panel = NULL,
      diag.panel = panel.hist,
      main = "Non-initiation points")  # fixed typo: "Non-initation"
# All points together, colored by class (black = initiation, red = non).
pairs(trainingData_no_y,
      upper.panel = NULL,
      diag.panel = panel.hist,
      main = "All points",
      col = ifelse(trainingData$class == "initiation", 1, 2))
legend("topright",
       legend = c("initiation", "non-initiation"),
       pch = 1,
       col = c(1, 2))
Next, we fit a random forest model on the training data, and examine some model stats.
# Fit a random forest classifier: predict class (initiation vs.
# non-initiation) from all predictor columns, default hyperparameters.
rfModel <- randomForest(
formula = class ~ .,
data = trainingData
)
# Error rate as a function of the number of trees.
plot(rfModel)
# OOB and per-class error rates at the final tree count.
errorRateDf <- data.frame(rfModel$err.rate[rfModel$ntree,])
colnames(errorRateDf) <- "error rate"
errorRateDf
## error rate
## OOB 0.12500000
## initiation 0.39189189
## non-initiation 0.03603604
# Label the confusion-matrix rows for readability.
rownames(rfModel$confusion) <- c("true initiation", "true non-initiation")
rfModel$confusion
## initiation non-initiation class.error
## true initiation 45 29 0.39189189
## true non-initiation 8 214 0.03603604
# Variable importance (mean decrease in Gini impurity).
imp <- importance(rfModel)
imp
## MeanDecreaseGini
## dev_15 19.103924
## prof_15 8.329492
## grad_15 16.262225
## plan_15 46.206514
## pca_15m_48hr 9.420284
## pca_15m_5hr 11.909237
varImpPlot(rfModel)
These plots examine the partial dependence of the model outcome on each variable, plotted in order of greatest to least importance. Higher values indicate that those values for a variable are more likely to result in classifying a point as an initiation point.
# Order variables from most to least important (by the first importance column).
vars_by_importance <- rownames(imp)[order(imp[, 1], decreasing = TRUE)]
for (variable in vars_by_importance) {
# eval(variable) works around partialPlot's non-standard evaluation so the
# variable NAME held in `variable` is used, not the literal symbol
# `variable` — TODO confirm against randomForest::partialPlot docs.
partialPlot(rfModel,
trainingData,
eval(variable),
xlab = variable,
main = paste("Partial Dependence on", variable))
}
Let’s see how the outcome varies depending on the non-initiation points.
# Build one training set per iteration, each with a fresh random draw of
# non-initiation points (the initiation points stay fixed).
# seq_len() is safe when the count is 0, unlike 1:n.
iterations <- seq_len(params$iterations_count)
trainingDataList <- lapply(iterations, function(iter) {
  # Sample a new set of non-initiation buffers from the eligible region.
  sampledNoninitBuffers <- generateNoninitiationBuffers(
    noninitBuffersCount,
    noninitRegion,
    bufferRadius
  )
  iterData <- extractBufferValues(
    varsRaster,
    initBuffers,
    sampledNoninitBuffers,
    bufferExtractionMethod
  )
  # Remove coordinates from data
  coordsCols <- names(iterData) %in% c("x", "y")
  iterData[, !coordsCols]
})
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
## Warning: [spatSample] fewer cells returned than requested
# Fit one random forest per resampled training set.
rfModelList <- lapply(trainingDataList, function(trainingSet) {
  randomForest(
    formula = class ~ .,
    data = trainingSet
  )
})
# For each iteration: show the training data structure, error rates,
# confusion matrix, variable importance, and partial-dependence plots.
for (i in iterations) {
  cat("MODEL ", i)
  trainingData <- trainingDataList[[i]]
  # The first six columns are the predictors; the class label follows.
  trainingData_no_y <- trainingData[, 1:6]
  rfModel <- rfModelList[[i]]
  pairs(trainingData_no_y[trainingData$class == "non-initiation",],
        upper.panel = NULL,
        diag.panel = panel.hist,
        main = "Non-initiation points")  # fixed typo: "Non-initation"
  cat("\n\nERROR RATE\n")
  # OOB and per-class error at the final tree count.
  errorRateDf <- data.frame(rfModel$err.rate[rfModel$ntree,])
  colnames(errorRateDf) <- "error rate"
  cat(paste0(capture.output(errorRateDf), collapse = "\n"))
  cat("\n\nCONFUSION MATRIX\n")
  rownames(rfModel$confusion) <- c("true initiation", "true non-initiation")
  cat(paste0(capture.output(rfModel$confusion), collapse = "\n"))
  cat("\n\nIMPORTANCE\n")
  imp <- importance(rfModel)
  cat(paste0(capture.output(imp), collapse = "\n"))
  varImpPlot(rfModel)
  # Partial dependence for each variable, most important first.
  vars_by_importance <- rownames(imp)[order(imp[, 1], decreasing = TRUE)]
  par(mfrow = c(3, 2))
  for (variable in vars_by_importance) {
    # eval(variable) forces partialPlot to use the variable name held in
    # `variable` rather than the literal symbol.
    partialPlot(rfModel,
                trainingData,
                eval(variable),
                xlab = variable,
                main = variable)
  }
  par(mfrow = c(1, 1))
}
## MODEL 1
##
##
## ERROR RATE
## error rate
## OOB 0.11525424
## initiation 0.32432432
## non-initiation 0.04524887
##
## CONFUSION MATRIX
## initiation non-initiation class.error
## true initiation 50 24 0.32432432
## true non-initiation 10 211 0.04524887
##
## IMPORTANCE
## MeanDecreaseGini
## dev_15 17.848251
## prof_15 8.869807
## grad_15 15.273370
## plan_15 50.814374
## pca_15m_48hr 9.434156
## pca_15m_5hr 8.588192
## MODEL 2
##
##
## ERROR RATE
## error rate
## OOB 0.12585034
## initiation 0.36486486
## non-initiation 0.04545455
##
## CONFUSION MATRIX
## initiation non-initiation class.error
## true initiation 47 27 0.36486486
## true non-initiation 10 210 0.04545455
##
## IMPORTANCE
## MeanDecreaseGini
## dev_15 16.139102
## prof_15 8.830830
## grad_15 16.134941
## plan_15 50.779926
## pca_15m_48hr 9.129391
## pca_15m_5hr 9.025634
## MODEL 3
##
##
## ERROR RATE
## error rate
## OOB 0.11525424
## initiation 0.36486486
## non-initiation 0.03167421
##
## CONFUSION MATRIX
## initiation non-initiation class.error
## true initiation 47 27 0.36486486
## true non-initiation 7 214 0.03167421
##
## IMPORTANCE
## MeanDecreaseGini
## dev_15 15.801922
## prof_15 9.205357
## grad_15 16.196440
## plan_15 50.945816
## pca_15m_48hr 9.348049
## pca_15m_5hr 9.189982
## MODEL 4
##
##
## ERROR RATE
## error rate
## OOB 0.10810811
## initiation 0.35135135
## non-initiation 0.02702703
##
## CONFUSION MATRIX
## initiation non-initiation class.error
## true initiation 48 26 0.35135135
## true non-initiation 6 216 0.02702703
##
## IMPORTANCE
## MeanDecreaseGini
## dev_15 16.285885
## prof_15 8.810138
## grad_15 15.413026
## plan_15 49.604670
## pca_15m_48hr 10.334452
## pca_15m_5hr 9.862828
## MODEL 5
##
##
## ERROR RATE
## error rate
## OOB 0.10847458
## initiation 0.36486486
## non-initiation 0.02262443
##
## CONFUSION MATRIX
## initiation non-initiation class.error
## true initiation 47 27 0.36486486
## true non-initiation 5 216 0.02262443
##
## IMPORTANCE
## MeanDecreaseGini
## dev_15 17.057033
## prof_15 9.003330
## grad_15 16.464679
## plan_15 49.783096
## pca_15m_48hr 9.708581
## pca_15m_5hr 9.130698
After fitting a random forest model, we can create a probability raster by fitting the data in the analysis region to the model. We do that for each iteration, and use the average of all values.
# Restrict the predictor stack to the analysis region before prediction.
analysisAreaVarsRaster <- terra::mask(varsRaster, analysisRegionMask)
# For each fitted model, predict the per-cell probability of the
# "initiation" class across the analysis region.
probRasterList <- lapply(rfModelList, function(model) {
terra::predict(
analysisAreaVarsRaster,
model,
na.rm = TRUE,
type = "prob"
)[["initiation"]]
})
# Stack the per-model probability layers, then summarize cell-wise.
probRasterList <- terra::rast(probRasterList)
mean_prob_raster <- terra::app(probRasterList, mean)
min_prob_raster <- terra::app(probRasterList, min)
max_prob_raster <- terra::app(probRasterList, max)
# Spread between iterations: where the models disagree most.
diff_prob_raster <- max_prob_raster - min_prob_raster
# Free the stacked probabilities; only the summaries are used below.
rm(probRasterList)
# Mean initiation probability over the basemap and hillshade,
# green (low) through red (high).
plotRGB(terrain)
plot(shade, col=grey(0:100/100), add = TRUE)
plot(mean_prob_raster,
col = colorRampPalette(c("#56d663", "#ba5123", "#9e2509"))(25),
add = TRUE)
# Hand-built legend. NOTE(review): this ramp is reversed relative to the
# plot ramp and labeled 0.9 to 0 — confirm the labels match the plotted colors.
legend("left",
legend = c(rep(NA, 3), "0.9", rep(NA, 23), "0"),
fill = c(rep("transparent", 3), colorRampPalette(c("#9e2509", "#ba5123", "#56d663"))(25)),
y.intersp = 0.5,
border = "transparent",
title = "Mean Probability")
We can calculate a proportion raster from the probability raster, and use it to see that, for example, 50% of landslides are predicted to occur in the green region plotted below.
# Cumulative-proportion raster derived from the mean probability surface.
prop_raster <- createPropRaster(mean_prob_raster)
plotRGB(terrain)
plot(shade, col = grey(0:100/100), add = TRUE)
# Cells whose proportion value is at least 0.5: where the top half of
# predicted landslides falls.
top50 <- terra::app(prop_raster, function(p) p >= 0.5)
plot(top50, add = TRUE, col = c("transparent", "green"))
Alternatively, the other 50% of landslides are predicted to occur in the rest of the region plotted below. These are the parts of the analysis region with lower probability of landslides.
# Same map with the colors flipped, highlighting the lower-probability half
# of the analysis region instead.
plotRGB(terrain)
plot(shade, col = grey(0:100/100), add = TRUE)
plot(top50, add = TRUE, col = c("green", "transparent"))